{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "import codecs\n",
    "import matplotlib.pyplot as plt \n",
    "import seaborn as sns\n",
    "%matplotlib inline\n",
    "from sklearn.preprocessing import normalize\n",
    "from sklearn.cluster import MiniBatchKMeans"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY4AAAENCAYAAAAYIIIKAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAFQ9JREFUeJzt3X+0ZWV93/H3BxAVIvJjLpTMTBwSpyphRaGzKMpa1DpWfkgZopIFVp0ltEMSQkDTRrB/YONKq0kaRGNIqCBDQ0AC0pkqEVmosV0WdFCiwGCZIDI3jMyl/DKyoo5++8d5ZnG8XJi7587Z517m/VrrrLP3s5+zv89Z4v3MfvaPk6pCkqTZ2mPcA5AkLSwGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjoxOCRJnRgckqRODA5JUid7jXsAo7Bo0aJatmzZuIchSQvKHXfc8UhVTeyo3/MyOJYtW8aGDRvGPQxJWlCSfHc2/ZyqkiR1YnBIkjoxOCRJnRgckqRODA5JUicGhySpE4NDktSJwSFJ6sTgkCR18ry8c3x395krThx5jZPP/OsZ2y/+y+NHWvc9b795pPuXtGMecUiSOjE4JEmdGBySpE4MDklSJwaHJKkTg0OS1InBIUnqZGTBkeSKJFuT3DXU9odJ7k3yzSQ3Jtl/aNuFSTYl+XaS44faT2htm5JcMKrxSpJmZ5RHHFcCJ0xruwU4oqp+Bfi/wIUASQ4HTgd+uX3mT5PsmWRP4OPAicDhwBmtryRpTEYWHFX1ZeDRaW2fr6ptbfU2YElbXgVcW1U/rKrvAJuAo9trU1XdX1U/Aq5tfSVJYzLOcxxnAtufW7EY2Dy0bbK1PVv7MyRZk2RDkg1TU1MjGK4kCcYUHEn+I7ANuHp70wzd6jnan9lYdVlVraiqFRMTE7tmoJKkZ+j9IYdJVgMnAyuransITAJLh7otAR5qy8/WLkkag16POJKcALwPOKWqnhratB44PckLkxwGLAe+CnwNWJ7ksCR7MziBvr7PMUuSftbIjjiSXAO8HliUZBK4iMFVVC8EbkkCcFtV/XpV3Z3kOuAeBlNY51TVT9p+fgu4GdgTuKKq7h7VmCVJOzay4KiqM2Zovvw5+v8+8PsztN8E3LQLhyZJmgPvHJckdWJwSJI6MTgkSZ0YHJKkTgwOSVInBockqRODQ5LUicEhSerE4JAkdWJwSJI6MTgkSZ0YHJKkTgwOSVInBockqRODQ5LUicEhSerE4JAkdWJwSJI6MTgkSZ0YHJKkTgwOSVInBockqRODQ5LUyciCI8kVSbYmuWuo7cAktyS5r70f0NqT5KNJNiX5ZpKjhj6zuvW/L8nqUY1XkjQ7ozziuBI4YVrbBcCtVbUcuLWtA5wILG+vNcClMAga4CLgnwNHAxdtDxtJ0niMLDiq6svAo9OaVwFr2/Ja4NSh9qtq4DZg/ySHAscDt1TVo1X1GHALzwwjSVKP+j7HcUhVbQFo7we39sXA5qF+k63t2dolSWMyX06OZ4a2eo72Z+4gWZNkQ5INU1NTu3RwkqSn9R0cD7cpKNr71tY+CSwd6rcEeOg52p+hqi6rqhVVtWJiYmKXD1ySNNB3cKwHtl8ZtRpYN9T+rnZ11THAE20q62bgTUkOaCfF39TaJEljsteodpzkGuD1wKIkkwyujvoQcF2Ss4AHgdNa95uAk4BNwFPAuwGq6tEkHwS+1vr9XlVNP+EuSerRyIKjqs54lk0rZ+hbwDnPsp8rgCt24dAkSXMwX06OS5IWCINDktSJwSFJ6sTgkCR1YnBIkjoxOCRJnRgckqRODA5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjoxOCRJnRgckqRODA5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjoxOCRJnRgckqRODA5JUidjCY4k70lyd5K7klyT5EVJDktye5L7knwqyd6t
7wvb+qa2fdk4xixJGug9OJIsBn4bWFFVRwB7AqcDHwYurqrlwGPAWe0jZwGPVdXLgYtbP0nSmOw1xrovTvJjYB9gC/AG4O1t+1rgA8ClwKq2DHA98CdJUlXV54ClZ/PmG/58pPv/7FvPHun+pa56P+Koqr8H/gh4kEFgPAHcATxeVdtat0lgcVteDGxun93W+h80fb9J1iTZkGTD1NTUaL+EJO3GxjFVdQCDo4jDgJ8H9gVOnKHr9iOKPMe2pxuqLquqFVW1YmJiYlcNV5I0zThOjr8R+E5VTVXVj4FPA68D9k+yfepsCfBQW54ElgK07S8FHu13yJKk7cYRHA8CxyTZJ0mAlcA9wBeBt7U+q4F1bXl9W6dt/4LnNyRpfMZxjuN2Bie5vw58q43hMuB9wHuTbGJwDuPy9pHLgYNa+3uBC/oesyTpaWO5qqqqLgIumtZ8P3D0DH3/ETitj3FJknbMO8clSZ0YHJKkTgwOSVInBockqZNZBUeSW2fTJkl6/nvOq6qSvIjBs6QWtTu+t9/FvR+Du74lSbuZHV2OezZwPoOQuIOng+NJ4OMjHJckaZ56zuCoqkuAS5KcW1Uf62lMkqR5bFY3AFbVx5K8Dlg2/JmqumpE45IkzVOzCo4k/x34JeBO4CetuQCDQ5J2M7N95MgK4HAfLihJmu19HHcB/2SUA5EkLQyzPeJYBNyT5KvAD7c3VtUpIxmVJGnemm1wfGCUg5AkLRyzvarqb0Y9EEnSwjDbq6q+z9O/87038ALgB1W136gGJkman2Z7xPGS4fUkpzLDjy5Jkp7/durpuFX1P4A37OKxSJIWgNlOVb1laHUPBvd1eE+HJO2GZntV1b8eWt4GPACs2uWjkSTNe7M9x/HuUQ9EkrQwzPaHnJYkuTHJ1iQPJ7khyZJRD06SNP/M9uT4J4H1DH6XYzHwP1ubJGk3M9vgmKiqT1bVtva6EpjY2aJJ9k9yfZJ7k2xM8tokBya5Jcl97f2A1jdJPppkU5JvJjlqZ+tKkuZutsHxSJJ3JNmzvd4B/L851L0E+FxVvRJ4NbARuAC4taqWA7e2dYATgeXttQa4dA51JUlzNNvgOBP4NeB7wBbgbcBOnTBPsh9wHHA5QFX9qKoeZ3CV1trWbS1walteBVxVA7cB+yc5dGdqS5LmbrbB8UFgdVVNVNXBDILkAztZ8xeBKeCTSb6R5BNJ9gUOqaotAO394NZ/MbB56POTrU2SNAazDY5fqarHtq9U1aPAkTtZcy/gKODSqjoS+AFPT0vNJDO0PePmwyRrkmxIsmFqamonhyZJ2pHZBsce209WAyQ5kNnfPDjdJDBZVbe39esZBMnD26eg2vvWof5Lhz6/BHho+k6r6rKqWlFVKyYmdvq8vSRpB2YbHP8V+EqSDyb5PeArwB/sTMGq+h6wOckrWtNK4B4Gl/uubm2rgXVteT3wrnZ11THAE9untCRJ/ZvtneNXJdnA4MGGAd5SVffMoe65wNVJ9gbuZ3CifQ/guiRnAQ8Cp7W+NwEnAZuAp9jJk/KSpF1j1tNNLSjmEhbD+7qTwYMSp1s5Q98CztkVdSVJc7dTj1WXJO2+DA5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjoxOCRJnRgckqRODA5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjoxOCRJnRgckqRODA5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjoxOCRJnYwtOJLsmeQbST7T1g9LcnuS+5J8Ksnerf2FbX1T275sXGOWJI33iOM8YOPQ+oeBi6tqOfAYcFZrPwt4rKpeDlzc+kmSxmQswZFkCfBm4BNtPcAbgOtbl7XAqW15VVunbV/Z+kuSxmBcRxwfAX4X+GlbPwh4vKq2tfVJYHFbXgxsBmjbn2j9JUlj0HtwJDkZ2FpVdww3z9C1ZrFteL9rkmxIsmFqamoXjFSSNJNxHHEcC5yS5AHgWgZTVB8B9k+yV+uz
BHioLU8CSwHa9pcCj07faVVdVlUrqmrFxMTEaL+BJO3Geg+OqrqwqpZU1TLgdOALVfVvgC8Cb2vdVgPr2vL6tk7b/oWqesYRhySpH/PpPo73Ae9NsonBOYzLW/vlwEGt/b3ABWManyQJ2GvHXUanqr4EfKkt3w8cPUOffwRO63VgkqRnNZ+OOCRJC4DBIUnqxOCQJHVicEiSOjE4JEmdGBySpE4MDklSJwaHJKkTg0OS1InBIUnqZKyPHJG08065ft2OO83B+retGun+tXB5xCFJ6sTgkCR1YnBIkjoxOCRJnXhyXJJ24N4/fXjkNV75m4eMvMau4hGHJKkTg0OS1InBIUnqxOCQJHVicEiSOjE4JEmdGBySpE68j0PPCyeu+/WR1/jrVX828hrSQtD7EUeSpUm+mGRjkruTnNfaD0xyS5L72vsBrT1JPppkU5JvJjmq7zFLkp42jqmqbcDvVNWrgGOAc5IcDlwA3FpVy4Fb2zrAicDy9loDXNr/kCVJ2/U+VVVVW4Atbfn7STYCi4FVwOtbt7XAl4D3tfarqqqA25Lsn+TQth9JY3DaDXeNdP9/9dYjRrp/zc1YT44nWQYcCdwOHLI9DNr7wa3bYmDz0McmW9v0fa1JsiHJhqmpqVEOW5J2a2MLjiQ/B9wAnF9VTz5X1xna6hkNVZdV1YqqWjExMbGrhilJmmYswZHkBQxC4+qq+nRrfjjJoW37ocDW1j4JLB36+BLgob7GKkn6WeO4qirA5cDGqvrjoU3rgdVteTWwbqj9Xe3qqmOAJzy/IUnjM477OI4F3gl8K8mdre39wIeA65KcBTwInNa23QScBGwCngLe3e9wJUnDxnFV1f9m5vMWACtn6F/AOSMdlCRp1rxzXNKCcd0Nj4x0/7/21kUj3f/zhc+qkiR1YnBIkjoxOCRJnRgckqRODA5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjrxkSMj9HcfWzXS/f/Suet23EnSgvbwR7460v0fcv7RnT/jEYckqRODQ5LUicEhSerE4JAkdWJwSJI6MTgkSZ087y/Hnbr0L0ZeY+I33jHyGpI0X3jEIUnqxOCQJHVicEiSOjE4JEmdLJjgSHJCkm8n2ZTkgnGPR5J2VwsiOJLsCXwcOBE4HDgjyeHjHZUk7Z4WRHAARwObqur+qvoRcC0w2kfPSpJmtFCCYzGweWh9srVJknqWqhr3GHYoyWnA8VX1b9v6O4Gjq+rcoT5rgDVt9RXAt+dQchHwyBw+v9DqjrP27lZ3nLX9zrtH7bnUfVlVTeyo00K5c3wSWDq0vgR4aLhDVV0GXLYriiXZUFUrdsW+FkLdcdbe3eqOs7bfefeo3UfdhTJV9TVgeZLDkuwNnA6sH/OYJGm3tCCOOKpqW5LfAm4G9gSuqKq7xzwsSdotLYjgAKiqm4Cbeiq3S6a8FlDdcdbe3eqOs7bfefeoPfK6C+LkuCRp/lgo5zgkSfOEwTEkyRVJtia5q+e6S5N8McnGJHcnOa+nui9K8tUkf9vq/qc+6g7V3zPJN5J8pue6DyT5VpI7k2zose7+Sa5Pcm/73/q1PdV9Rfuu219PJjm/p9rvaf9t3ZXkmiQv6qNuq31eq3v3KL/vTH83khyY5JYk97X3A3qsfVr7zj9NMpKrqwyOn3UlcMIY6m4DfqeqXgUcA5zT0yNVfgi8oapeDbwGOCHJMT3U3e48YGOP9Yb9y6p6Tc+XS14CfK6qXgm8mp6+e1V9u33X1wD/DHgKuHHUdZMsBn4bWFFVRzC4sOX0UddttY8A/h2Dp068Gjg5yfIRlbuSZ/7duAC4taqWA7e29b5q3wW8BfjyiGoaHMOq6svAo2Oou6Wqvt6Wv8/gD8rI74yvgX9oqy9or15OeiVZArwZ+EQf9cYtyX7AccDlAFX1o6p6fAxDWQn8XVV9t6d6ewEvTrIXsA/T7r8aoVcBt1XVU1W1Dfgb4FdHUehZ/m6sAta25bXAqX3VrqqN
VTWXG6B3yOCYZ5IsA44Ebu+p3p5J7gS2ArdUVS91gY8Avwv8tKd6wwr4fJI72hMH+vCLwBTwyTY994kk+/ZUe9jpwDV9FKqqvwf+CHgQ2AI8UVWf76M2g391H5fkoCT7ACfxszcRj9ohVbUFBv8wBA7usfbIGRzzSJKfA24Azq+qJ/uoWVU/aVMYS4Cj2yH+SCU5GdhaVXeMutazOLaqjmLwtOVzkhzXQ829gKOAS6vqSOAHjG76Ykbt5tlTgL/qqd4BDP7lfRjw88C+Sd7RR+2q2gh8GLgF+BzwtwymhLULGBzzRJIXMAiNq6vq033Xb9MmX6KfczzHAqckeYDBk47fkOQveqgLQFU91N63MpjrP7qHspPA5NAR3fUMgqRPJwJfr6qHe6r3RuA7VTVVVT8GPg28rqfaVNXlVXVUVR3HYDrnvr5qAw8nORSgvW/tsfbIGRzzQJIwmPveWFV/3GPdiST7t+UXM/g/+r2jrltVF1bVkqpaxmDq5AtV1cu/RJPsm+Ql25eBNzGY1hipqvoesDnJK1rTSuCeUded5gx6mqZqHgSOSbJP+298JT1eDJHk4Pb+CwxOFvf53dcDq9vyamBdj7VHbsHcOd6HJNcArwcWJZkELqqqy3sofSzwTuBb7XwDwPvb3fKjdCiwtv1Q1h7AdVXV66WxY3AIcOPg7xh7AX9ZVZ/rqfa5wNVtyuh+4N091aXN8/8r4Oy+albV7UmuB77OYJroG/R7N/UNSQ4CfgycU1WPjaLITH83gA8B1yU5i0GAntZj7UeBjwETwGeT3FlVx+/Sut45LknqwqkqSVInBockqRODQ5LUicEhSerE4JAkdWJwSDupPWX3jbPoV0levpM1dvqz0qgYHJKkTgwOSVInBoc0R0mOTvJ/kjyeZEuSP2l3hw87Kcn9SR5J8odJ9hj6/Jnth50eS3Jzkpf1/BWkTgwOae5+ArwHWAS8lsEzmX5zWp9fBVYweLDhKuBMgCSnAu9n8CylCeB/0e8zlaTODA5pjqrqjqq6raq2VdUDwJ8D/2Jatw9X1aNV9SCD3yI5o7WfDfyX9uM724D/DLzGow7NZwaHNEdJ/mmSzyT5XpInGfzxXzSt2+ah5e8y+H0KgJcBl7RprscZPKAu9PALkNLOMjikubuUwePol1fVfgymnjKtz/Cvz/0CT/+E6mbg7Kraf+j14qr6yshHLe0kg0Oau5cATwL/kOSVwG/M0Oc/JDkgyVLgPOBTrf3PgAuT/DJAkpcmGckjuKVdxeCQ5u7fA28Hvg/8N54OhWHrgDuAO4HPMvjhLqrqRgY/cXptm+a6i8Ev9Unzlr/HIUnqxCMOSVInBockqRODQ5LUicEhSerE4JAkdWJwSJI6MTgkSZ0YHJKkTgwOSVIn/x/1aMs6XUFhIwAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0xb0a8fd0>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Load the labelled training set: a header-less CSV whose two columns are\n",
    "# the class label and the raw text.\n",
    "# NOTE(review): absolute, machine-specific path -- adjust for your environment.\n",
    "train = pd.read_csv(r'C:\\Users\\Administrator\\Desktop\\training.csv',header=None,encoding=\"utf8\")\n",
    "train.columns = [\"label\",\"text\"]\n",
    "\n",
    "# Bar chart of the number of samples per label.\n",
    "# The data is passed as the keyword `x`: newer seaborn releases treat the first\n",
    "# positional argument of countplot as `data`, not `x`.\n",
    "fig, ax = plt.subplots()\n",
    "sns.countplot(x=train[\"label\"], ax=ax)\n",
    "ax.set_xlabel('label', fontsize=12)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 4774 entries, 0 to 4773\n",
      "Data columns (total 2 columns):\n",
      "label    4774 non-null int64\n",
      "text     4774 non-null object\n",
      "dtypes: int64(1), object(1)\n",
      "memory usage: 74.7+ KB\n"
     ]
    }
   ],
   "source": [
    "# Print the index range, per-column dtype and non-null count, and memory usage.\n",
    "train.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>4774.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>5.069543</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>2.287017</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>3.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>4.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>6.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>11.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             label\n",
       "count  4774.000000\n",
       "mean      5.069543\n",
       "std       2.287017\n",
       "min       1.000000\n",
       "25%       3.000000\n",
       "50%       4.000000\n",
       "75%       6.000000\n",
       "max      11.000000"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.\n",
    "train.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       合晟资产是一家专注于股票、债券等二级市场投资，为合格投资者提供专业资产管理服务的企业。公司业...\n",
       "1       公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。\n",
       "2       公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖...\n",
       "3       公司经工商管理部门核准的经营范围为“投资咨询、经济信息咨询，企业管理咨询，品牌推广策划，公共...\n",
       "4       该公司的主营业务为在中国境内(港、澳、台除外)开展保险代理销售，依托于自身的产品研究能力和专...\n",
       "5       公司主营业务为地铁商业物业的租赁与运营管理服务。公司以整体租赁的方式取得轨道交通站点附属建筑...\n",
       "6       作为位于南京市江宁区的小额信贷融资服务供应商，公司专注于满足当地中小微企业、个体经营者及个人...\n",
       "7       公司主要为商业地产商提供服务，主要内容为项目前期的市场调研，项目中期的招商营销策划、营销代理...\n",
       "8       青岛拥湾资产管理集团股份有限公司(简称“拥湾资产”，股票代码834606)拥湾资产成立于20...\n",
       "9       公司的主营业务为提供物业管理服务，公司自成立以来主营业务未发生变化。2013年、2014年及...\n",
       "10      公司提供的主要服务包括：物业管理；家政服务；洗染服务；建筑清洁服务；摄影扩印服务；房地产居间...\n",
       "11      根据《上市公司行业分类指引》（2012年修订），公司所属行业为其他金融业（J69）；根据《国...\n",
       "12      公司是一家从事房地产营销的顾问代理机构，是浙江宁波、舟山两地市场份额较大的房地产综合服务商，...\n",
       "13      公司专业从事高档写字楼、住宅、政府机关大楼等中高档物业项目的管理服务工作，拥有物业服务企业国...\n",
       "14      公司主要为房地产开发商提供传统营销代理服务、顾问策划服务以及基于移动互联网平台的新型代理销售...\n",
       "15      公司以平台信息服务和房屋交易中介服务为支点，业务覆盖信息咨询服务、代理一手房销售、代理二手房...\n",
       "16      公司系一家专注于供应链体系中物流与生产两大产业，以保险产品代理销售为主营业务的专业保险代理销...\n",
       "17      公司主要从事私募股权投资基金管理和自有资金直接股权投资，以及资产管理计划投后管理等业务。公司...\n",
       "18      公司成立于2012年1月，注册资金2.5亿，是由扬州广电集团控股，经江苏省金融办批准成立的组...\n",
       "19      公司以私募股权投资基金管理为主营业务，系国内领先的专注于消费品投资的私募股权投资机构。公司管...\n",
       "20      公司是一家在商办物业领域具备从咨询顾问、物业管理到工商物业运营全产品服务能力的企业，能为商办...\n",
       "21      公司是一家专注于从事保险产品的代理销售的专业保险代理公司。自公司成立以来，公司依托自身平台，...\n",
       "22      经工商登记机关核准，本公司的经营范围包括：面向“三农”发放贷款、提供融资性担保、开展金融机构...\n",
       "23        公司是一家主要涉及新房代理销售、二手房中介品牌特许加盟及运营管理的综合性O2O房地产服务公司。\n",
       "24      根据大连市金融发展局批复，经工商登记机关核准，公司的经营范围及主营业务为：办理各项小额贷款和...\n",
       "25      浏阳市通源小额贷款有限公司，是经浏阳市政府招标，经湖南省人民政府金融工作办公室批准，在工商部...\n",
       "26      公司是按照人民银行、银监会有关规定设立，经山东省政府金融办审批，在济南市高新区辖区内合法经营...\n",
       "27      公司的主营业务为私募股权投资基金管理业务以及股权投资业务。公司从事私募股权投资基金管理业务，...\n",
       "28      公司作为投资型孵化器机构，是天使投资行业的延伸，经营模式为“企业孵化服务+天使投资”，兼具了...\n",
       "29      根据中国证监会《上市公司行业分类指引》（2012年修订）的规定，公司所属行业为“J-69其他...\n",
       "                              ...                        \n",
       "4744    公司经营范围为：高低压开关成套设备，无功控制及补偿系统，配电自动化系统，电力信息管理系统，路...\n",
       "4745    公司主要从事建筑智能化及相关工程的技术服务，公司2014年建筑智能化营业收入占主营业务收入比...\n",
       "4746    公司主营业务是中药饮片生产与销售，隶属于中药饮片加工行业。公司零售业务的主要客户为终端消费者...\n",
       "4747    公司是一家专业致力于企业信息化建设与服务,以软件研发、销售及软件服务为主要业务的科技企业。公...\n",
       "4748    公司经工商部门核准的经营范围为：自动化控制的软、硬件开发、销售；机器人租赁业务；经营进出口业...\n",
       "4749    公司主要业务包括园林绿化施工、园林绿化养护和绿化苗木培育等。公司自成立起即从事园林绿化业务，...\n",
       "4750    公司主营业务主要包括四大业务板块，分别为园林绿化工程及养护、园林景观设计、苗圃生产销售及基地...\n",
       "4751    公司是一家专业从事影视作品后期制作、影视器材设备租赁以及影视剧投资拍摄的企业。公司主营业务为...\n",
       "4752    公司成立开始，是一家专门从事进口铁矿石现货贸易的贸易商，业务主要集中在华北地区，包括河北、天...\n",
       "4753    公司专注于医疗信息化领域的软件系统开发、技术咨询服务、系统集成、大型工程项目实施，为广大用户...\n",
       "4754    公司是一家集科研开发、生产、销售与服务为一体的现代化动物药品高新技术企业。主要从事兽药的研发...\n",
       "4755    公司以构建中国“互联网+”创意产业服务众包生态圈为使命,致力于打造中国创意产业校企服务第一平...\n",
       "4756    公司是一家集电梯部件研发、生产和销售为一体的电梯部件制造企业，主要产品包括机房部件、桥架及井...\n",
       "4757    公司是一家专业提供无纸化会议系统解决方案的公司，主要从事无纸化智能会议系统软件及硬件产品的研...\n",
       "4758    公司经工商部门核准的经营范围为：机器人及自动化装备、AGV、传感器、机器视觉产品的研发、生产...\n",
       "4759    公司主营业务为热处理设备的设计、研发、生产和销售及相关设备配件销售及维修。公司总部设在北京，...\n",
       "4760    公司自成立以来致力于园林绿化事业，主要从事园林工程施工、苗木种植销售及园林绿化养护，主要客户...\n",
       "4761    本公司自成立以来，一直专注于以前沿的软件开发与实施技术，向客户提供资金管理咨询、资金管理软件...\n",
       "4762    公司主要产品为塑料中空板产品，包括中空板周转箱类、包装箱类、广告牌类、瓶托类、果蔬围板类和电...\n",
       "4763    泰克贝思自成立以来一致专注于数字出版领域，主要服务于出版物供应者（出版单位）以及出版物使用者...\n",
       "4764    公司作为工业机器人系统整体解决方案的提供商，为汽车及零部件、工程机械、运动器材等众多行业客户...\n",
       "4765    公司经工商部门核准的经营范围为：生产和销售汽车门板饰条、仪表板饰条、中控制台、组合仪表盖板总...\n",
       "4766    公司主营业务为网站开发技术服务，同时，公司在网站开发技术服务过程中会为有需求的客户提供移动宽...\n",
       "4767    公司经工商部门核准的经营范围为：“抗震产品、综合支吊架、检测设备及相关软件的技术研发、生产与...\n",
       "4768    公司作为一家以整合营销传播见长的专业公关公司，依托多年的公关管理经验、先进的服务理念，为以上...\n",
       "4769    公司的主营业务为药品的研发，制造和销售。根据《上市公司行业分类指引》（2012年修订版），公...\n",
       "4770    医疗器械的经销业务为经销锐珂公司的医用干式胶片产品。北京市卫生局于2007年对医疗机构医用耗...\n",
       "4771    公司以绿色、环保为经营理念，主要从事汽车美容养护产品的研发、生产和销售，目前拥有专利技术4项...\n",
       "4772    公司致力于引进和研发国际先进的基因检测分析技术,在此基础上开发一系列针对精准医疗、健康管理的...\n",
       "4773    公司主营业务为提供第三方工程安全风险管理咨询服务。在整合安全工程、管理工程、土木工程、信息工...\n",
       "Name: text, Length: 4774, dtype: object"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the raw text column; pandas abbreviates the middle rows of a long Series.\n",
    "train['text']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_file(file_path):\n",
    "    f = codecs.open(file_path,mode = 'r',encoding=\"gbk\")\n",
    "    lines = []\n",
    "    for line in f :\n",
    "        line = line.rstrip('\\n').rstrip('\\r')\n",
    "        lines.append(line)\n",
    "    return lines"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the stop-word list, one entry per line, via read_file (GBK-decoded).\n",
    "# NOTE(review): absolute, machine-specific path -- adjust for your environment.\n",
    "stopwords = read_file(r'C:\\Users\\Administrator\\Desktop\\stop.txt')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read_file already returns a list, so this only makes a shallow copy of it.\n",
    "stopwords = list(stopwords)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache C:\\Users\\Administrator\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 1.023 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import codecs\n",
    "import jieba as jb\n",
    "\n",
    "# Build the lookup set once: membership tests on a set are O(1), whereas the\n",
    "# `stopwords` list would be scanned from the start for every token.\n",
    "_stopword_set = set(stopwords)\n",
    "\n",
    "def segment_text(each_row):\n",
    "    \"\"\"Segment one row's text with jieba and drop stop words.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    each_row : mapping\n",
    "        A row of `train` (or any mapping) whose 'text' entry is the string\n",
    "        to segment.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str\n",
    "        The tokens that are not stop words, joined by single spaces.\n",
    "    \"\"\"\n",
    "    # jb.lcut returns the segmentation result as a list of tokens.\n",
    "    tokens = jb.lcut(each_row['text'])\n",
    "    return ' '.join([word for word in tokens if word not in _stopword_set])\n",
    "\n",
    "train['text_segmentation'] = train.apply(segment_text,axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['!',\n",
       " '\"',\n",
       " '#',\n",
       " '$',\n",
       " '%',\n",
       " '&',\n",
       " \"'\",\n",
       " '(',\n",
       " ')',\n",
       " '*',\n",
       " '+',\n",
       " ',',\n",
       " '-',\n",
       " '--',\n",
       " '.',\n",
       " '..',\n",
       " '...',\n",
       " '......',\n",
       " '...................',\n",
       " './',\n",
       " '.一',\n",
       " '.数',\n",
       " '.日',\n",
       " '/',\n",
       " '//',\n",
       " '0',\n",
       " '1',\n",
       " '2',\n",
       " '3',\n",
       " '4',\n",
       " '5',\n",
       " '6',\n",
       " '7',\n",
       " '8',\n",
       " '9',\n",
       " ':',\n",
       " '://',\n",
       " '::',\n",
       " ';',\n",
       " '<',\n",
       " '=',\n",
       " '>',\n",
       " '>>',\n",
       " '?',\n",
       " '@',\n",
       " 'A',\n",
       " 'Lex',\n",
       " '[',\n",
       " '\\\\',\n",
       " ']',\n",
       " '^',\n",
       " '_',\n",
       " '`',\n",
       " 'exp',\n",
       " 'sub',\n",
       " 'sup',\n",
       " '|',\n",
       " '}',\n",
       " '~',\n",
       " '~~~~',\n",
       " '·',\n",
       " '×',\n",
       " '×××',\n",
       " 'Δ',\n",
       " 'Ψ',\n",
       " 'γ',\n",
       " 'μ',\n",
       " 'φ',\n",
       " 'φ．',\n",
       " 'В',\n",
       " '—',\n",
       " '——',\n",
       " '———',\n",
       " '‘',\n",
       " '’',\n",
       " '’‘',\n",
       " '“',\n",
       " '”',\n",
       " '”，',\n",
       " '…',\n",
       " '……',\n",
       " '…………………………………………………③',\n",
       " '′∈',\n",
       " '′｜',\n",
       " '℃',\n",
       " 'Ⅲ',\n",
       " '↑',\n",
       " '→',\n",
       " '∈［',\n",
       " '∪φ∈',\n",
       " '≈',\n",
       " '①',\n",
       " '②',\n",
       " '②ｃ',\n",
       " '③',\n",
       " '③］',\n",
       " '④',\n",
       " '⑤',\n",
       " '⑥',\n",
       " '⑦',\n",
       " '⑧',\n",
       " '⑨',\n",
       " '⑩',\n",
       " '──',\n",
       " '■',\n",
       " '▲',\n",
       " '\\u3000',\n",
       " '、',\n",
       " '。',\n",
       " '〈',\n",
       " '〉',\n",
       " '《',\n",
       " '》',\n",
       " '》），',\n",
       " '」',\n",
       " '『',\n",
       " '』',\n",
       " '【',\n",
       " '】',\n",
       " '〔',\n",
       " '〕',\n",
       " '〕〔',\n",
       " '㈧',\n",
       " '一',\n",
       " '一.',\n",
       " '一一',\n",
       " '一下',\n",
       " '一个',\n",
       " '一些',\n",
       " '一何',\n",
       " '一切',\n",
       " '一则',\n",
       " '一则通过',\n",
       " '一天',\n",
       " '一定',\n",
       " '一方面',\n",
       " '一旦',\n",
       " '一时',\n",
       " '一来',\n",
       " '一样',\n",
       " '一次',\n",
       " '一片',\n",
       " '一番',\n",
       " '一直',\n",
       " '一致',\n",
       " '一般',\n",
       " '一起',\n",
       " '一转眼',\n",
       " '一边',\n",
       " '一面',\n",
       " '七',\n",
       " '万一',\n",
       " '三',\n",
       " '三天两头',\n",
       " '三番两次',\n",
       " '三番五次',\n",
       " '上',\n",
       " '上下',\n",
       " '上升',\n",
       " '上去',\n",
       " '上来',\n",
       " '上述',\n",
       " '上面',\n",
       " '下',\n",
       " '下列',\n",
       " '下去',\n",
       " '下来',\n",
       " '下面',\n",
       " '不',\n",
       " '不一',\n",
       " '不下',\n",
       " '不久',\n",
       " '不了',\n",
       " '不亦乐乎',\n",
       " '不仅',\n",
       " '不仅...而且',\n",
       " '不仅仅',\n",
       " '不仅仅是',\n",
       " '不会',\n",
       " '不但',\n",
       " '不但...而且',\n",
       " '不光',\n",
       " '不免',\n",
       " '不再',\n",
       " '不力',\n",
       " '不单',\n",
       " '不变',\n",
       " '不只',\n",
       " '不可',\n",
       " '不可开交',\n",
       " '不可抗拒',\n",
       " '不同',\n",
       " '不外',\n",
       " '不外乎',\n",
       " '不够',\n",
       " '不大',\n",
       " '不如',\n",
       " '不妨',\n",
       " '不定',\n",
       " '不对',\n",
       " '不少',\n",
       " '不尽',\n",
       " '不尽然',\n",
       " '不巧',\n",
       " '不已',\n",
       " '不常',\n",
       " '不得',\n",
       " '不得不',\n",
       " '不得了',\n",
       " '不得已',\n",
       " '不必',\n",
       " '不怎么',\n",
       " '不怕',\n",
       " '不惟',\n",
       " '不成',\n",
       " '不拘',\n",
       " '不择手段',\n",
       " '不敢',\n",
       " '不料',\n",
       " '不断',\n",
       " '不日',\n",
       " '不时',\n",
       " '不是',\n",
       " '不曾',\n",
       " '不止',\n",
       " '不止一次',\n",
       " '不比',\n",
       " '不消',\n",
       " '不满',\n",
       " '不然',\n",
       " '不然的话',\n",
       " '不特',\n",
       " '不独',\n",
       " '不由得',\n",
       " '不知不觉',\n",
       " '不管',\n",
       " '不管怎样',\n",
       " '不经意',\n",
       " '不胜',\n",
       " '不能',\n",
       " '不能不',\n",
       " '不至于',\n",
       " '不若',\n",
       " '不要',\n",
       " '不论',\n",
       " '不起',\n",
       " '不足',\n",
       " '不过',\n",
       " '不迭',\n",
       " '不问',\n",
       " '不限',\n",
       " '与',\n",
       " '与其',\n",
       " '与其说',\n",
       " '与否',\n",
       " '与此同时',\n",
       " '专门',\n",
       " '且',\n",
       " '且不说',\n",
       " '且说',\n",
       " '两者',\n",
       " '严格',\n",
       " '严重',\n",
       " '个',\n",
       " '个人',\n",
       " '个别',\n",
       " '中小',\n",
       " '中间',\n",
       " '丰富',\n",
       " '串行',\n",
       " '临',\n",
       " '临到',\n",
       " '为',\n",
       " '为主',\n",
       " '为了',\n",
       " '为什么',\n",
       " '为什麽',\n",
       " '为何',\n",
       " '为止',\n",
       " '为此',\n",
       " '为着',\n",
       " '主张',\n",
       " '主要',\n",
       " '举凡',\n",
       " '举行',\n",
       " '乃',\n",
       " '乃至',\n",
       " '乃至于',\n",
       " '么',\n",
       " '之',\n",
       " '之一',\n",
       " '之前',\n",
       " '之后',\n",
       " '之後',\n",
       " '之所以',\n",
       " '之类',\n",
       " '乌乎',\n",
       " '乎',\n",
       " '乒',\n",
       " '乘',\n",
       " '乘势',\n",
       " '乘机',\n",
       " '乘胜',\n",
       " '乘虚',\n",
       " '乘隙',\n",
       " '九',\n",
       " '也',\n",
       " '也好',\n",
       " '也就是说',\n",
       " '也是',\n",
       " '也罢',\n",
       " '了',\n",
       " '了解',\n",
       " '争取',\n",
       " '二',\n",
       " '二来',\n",
       " '二话不说',\n",
       " '二话没说',\n",
       " '于',\n",
       " '于是',\n",
       " '于是乎',\n",
       " '云云',\n",
       " '云尔',\n",
       " '互',\n",
       " '互相',\n",
       " '五',\n",
       " '些',\n",
       " '交口',\n",
       " '亦',\n",
       " '产生',\n",
       " '亲口',\n",
       " '亲手',\n",
       " '亲眼',\n",
       " '亲自',\n",
       " '亲身',\n",
       " '人',\n",
       " '人人',\n",
       " '人们',\n",
       " '人家',\n",
       " '人民',\n",
       " '什么',\n",
       " '什么样',\n",
       " '什麽',\n",
       " '仅',\n",
       " '仅仅',\n",
       " '今',\n",
       " '今后',\n",
       " '今天',\n",
       " '今年',\n",
       " '今後',\n",
       " '介于',\n",
       " '仍',\n",
       " '仍旧',\n",
       " '仍然',\n",
       " '从',\n",
       " '从不',\n",
       " '从严',\n",
       " '从中',\n",
       " '从事',\n",
       " '从今以后',\n",
       " '从优',\n",
       " '从古到今',\n",
       " '从古至今',\n",
       " '从头',\n",
       " '从宽',\n",
       " '从小',\n",
       " '从新',\n",
       " '从无到有',\n",
       " '从早到晚',\n",
       " '从未',\n",
       " '从来',\n",
       " '从此',\n",
       " '从此以后',\n",
       " '从而',\n",
       " '从轻',\n",
       " '从速',\n",
       " '从重',\n",
       " '他',\n",
       " '他人',\n",
       " '他们',\n",
       " '他是',\n",
       " '他的',\n",
       " '代替',\n",
       " '以',\n",
       " '以上',\n",
       " '以下',\n",
       " '以为',\n",
       " '以便',\n",
       " '以免',\n",
       " '以前',\n",
       " '以及',\n",
       " '以后',\n",
       " '以外',\n",
       " '以後',\n",
       " '以故',\n",
       " '以期',\n",
       " '以来',\n",
       " '以至',\n",
       " '以至于',\n",
       " '以致',\n",
       " '们',\n",
       " '任',\n",
       " '任何',\n",
       " '任凭',\n",
       " '任务',\n",
       " '企图',\n",
       " '伙同',\n",
       " '会',\n",
       " '伟大',\n",
       " '传',\n",
       " '传说',\n",
       " '传闻',\n",
       " '似乎',\n",
       " '似的',\n",
       " '但',\n",
       " '但凡',\n",
       " '但愿',\n",
       " '但是',\n",
       " '何',\n",
       " '何乐而不为',\n",
       " '何以',\n",
       " '何况',\n",
       " '何处',\n",
       " '何妨',\n",
       " '何尝',\n",
       " '何必',\n",
       " '何时',\n",
       " '何止',\n",
       " '何苦',\n",
       " '何须',\n",
       " '余外',\n",
       " '作为',\n",
       " '你',\n",
       " '你们',\n",
       " '你是',\n",
       " '你的',\n",
       " '使',\n",
       " '使得',\n",
       " '使用',\n",
       " '例如',\n",
       " '依',\n",
       " '依据',\n",
       " '依照',\n",
       " '依靠',\n",
       " '便',\n",
       " '便于',\n",
       " '促进',\n",
       " '保持',\n",
       " '保管',\n",
       " '保险',\n",
       " '俺',\n",
       " '俺们',\n",
       " '倍加',\n",
       " '倍感',\n",
       " '倒不如',\n",
       " '倒不如说',\n",
       " '倒是',\n",
       " '倘',\n",
       " '倘使',\n",
       " '倘或',\n",
       " '倘然',\n",
       " '倘若',\n",
       " '借',\n",
       " '借以',\n",
       " '借此',\n",
       " '假使',\n",
       " '假如',\n",
       " '假若',\n",
       " '偏偏',\n",
       " '做到',\n",
       " '偶尔',\n",
       " '偶而',\n",
       " '傥然',\n",
       " '像',\n",
       " '儿',\n",
       " '允许',\n",
       " '元／吨',\n",
       " '充其极',\n",
       " '充其量',\n",
       " '充分',\n",
       " '先不先',\n",
       " '先后',\n",
       " '先後',\n",
       " '先生',\n",
       " '光',\n",
       " '光是',\n",
       " '全体',\n",
       " '全力',\n",
       " '全年',\n",
       " '全然',\n",
       " '全身心',\n",
       " '全部',\n",
       " '全都',\n",
       " '全面',\n",
       " '八',\n",
       " '八成',\n",
       " '公然',\n",
       " '六',\n",
       " '兮',\n",
       " '共',\n",
       " '共同',\n",
       " '共总',\n",
       " '关于',\n",
       " '其',\n",
       " '其一',\n",
       " '其中',\n",
       " '其二',\n",
       " '其他',\n",
       " '其余',\n",
       " '其后',\n",
       " '其它',\n",
       " '其实',\n",
       " '其次',\n",
       " '具体',\n",
       " '具体地说',\n",
       " '具体来说',\n",
       " '具体说来',\n",
       " '具有',\n",
       " '兼之',\n",
       " '内',\n",
       " '再',\n",
       " '再其次',\n",
       " '再则',\n",
       " '再有',\n",
       " '再次',\n",
       " '再者',\n",
       " '再者说',\n",
       " '再说',\n",
       " '冒',\n",
       " '冲',\n",
       " '决不',\n",
       " '决定',\n",
       " '决非',\n",
       " '况且',\n",
       " '准备',\n",
       " '凑巧',\n",
       " '凝神',\n",
       " '几',\n",
       " '几乎',\n",
       " '几度',\n",
       " '几时',\n",
       " '几番',\n",
       " '几经',\n",
       " '凡',\n",
       " '凡是',\n",
       " '凭',\n",
       " '凭借',\n",
       " '出',\n",
       " '出于',\n",
       " '出去',\n",
       " '出来',\n",
       " '出现',\n",
       " '分别',\n",
       " '分头',\n",
       " '分期',\n",
       " '分期分批',\n",
       " '切',\n",
       " '切不可',\n",
       " '切切',\n",
       " '切勿',\n",
       " '切莫',\n",
       " '则',\n",
       " '则甚',\n",
       " '刚',\n",
       " '刚好',\n",
       " '刚巧',\n",
       " '刚才',\n",
       " '初',\n",
       " '别',\n",
       " '别人',\n",
       " '别处',\n",
       " '别是',\n",
       " '别的',\n",
       " '别管',\n",
       " '别说',\n",
       " '到',\n",
       " '到了儿',\n",
       " '到处',\n",
       " '到头',\n",
       " '到头来',\n",
       " '到底',\n",
       " '到目前为止',\n",
       " '前后',\n",
       " '前此',\n",
       " '前者',\n",
       " '前进',\n",
       " '前面',\n",
       " '加上',\n",
       " '加之',\n",
       " '加以',\n",
       " '加入',\n",
       " '加强',\n",
       " '动不动',\n",
       " '动辄',\n",
       " '勃然',\n",
       " '匆匆',\n",
       " '十分',\n",
       " '千',\n",
       " '千万',\n",
       " '千万千万',\n",
       " '半',\n",
       " '单',\n",
       " '单单',\n",
       " '单纯',\n",
       " '即',\n",
       " '即令',\n",
       " '即使',\n",
       " '即便',\n",
       " '即刻',\n",
       " '即如',\n",
       " '即将',\n",
       " '即或',\n",
       " '即是说',\n",
       " '即若',\n",
       " '却',\n",
       " '却不',\n",
       " '历',\n",
       " '原来',\n",
       " '去',\n",
       " '又',\n",
       " '又及',\n",
       " '及',\n",
       " '及其',\n",
       " '及时',\n",
       " '及至',\n",
       " '双方',\n",
       " '反之',\n",
       " '反之亦然',\n",
       " '反之则',\n",
       " '反倒',\n",
       " '反倒是',\n",
       " '反应',\n",
       " '反手',\n",
       " '反映',\n",
       " '反而',\n",
       " '反过来',\n",
       " '反过来说',\n",
       " '取得',\n",
       " '取道',\n",
       " '受到',\n",
       " '变成',\n",
       " '古来',\n",
       " '另',\n",
       " '另一个',\n",
       " '另一方面',\n",
       " '另外',\n",
       " '另悉',\n",
       " '另方面',\n",
       " '另行',\n",
       " '只',\n",
       " '只当',\n",
       " '只怕',\n",
       " '只是',\n",
       " '只有',\n",
       " '只消',\n",
       " '只要',\n",
       " '只限',\n",
       " '叫',\n",
       " '叫做',\n",
       " '召开',\n",
       " '叮咚',\n",
       " '叮当',\n",
       " '可',\n",
       " '可以',\n",
       " '可好',\n",
       " '可是',\n",
       " '可能',\n",
       " '可见',\n",
       " '各',\n",
       " '各个',\n",
       " '各人',\n",
       " '各位',\n",
       " '各地',\n",
       " '各式',\n",
       " '各种',\n",
       " '各级',\n",
       " '各自',\n",
       " '合理',\n",
       " '同',\n",
       " '同一',\n",
       " '同时',\n",
       " '同样',\n",
       " '后',\n",
       " '后来',\n",
       " '后者',\n",
       " '后面',\n",
       " '向',\n",
       " '向使',\n",
       " '向着',\n",
       " '吓',\n",
       " '吗',\n",
       " '否则',\n",
       " '吧',\n",
       " '吧哒',\n",
       " '吱',\n",
       " '呀',\n",
       " '呃',\n",
       " '呆呆地',\n",
       " '呐',\n",
       " '呕',\n",
       " '呗',\n",
       " '呜',\n",
       " '呜呼',\n",
       " '呢',\n",
       " '周围',\n",
       " '呵',\n",
       " '呵呵',\n",
       " '呸',\n",
       " '呼哧',\n",
       " '呼啦',\n",
       " '咋',\n",
       " '和',\n",
       " '咚',\n",
       " '咦',\n",
       " '咧',\n",
       " '咱',\n",
       " '咱们',\n",
       " '咳',\n",
       " '哇',\n",
       " '哈',\n",
       " '哈哈',\n",
       " '哉',\n",
       " '哎',\n",
       " '哎呀',\n",
       " '哎哟',\n",
       " '哗',\n",
       " '哗啦',\n",
       " '哟',\n",
       " '哦',\n",
       " '哩',\n",
       " '哪',\n",
       " '哪个',\n",
       " '哪些',\n",
       " '哪儿',\n",
       " '哪天',\n",
       " '哪年',\n",
       " '哪怕',\n",
       " '哪样',\n",
       " '哪边',\n",
       " '哪里',\n",
       " '哼',\n",
       " '哼唷',\n",
       " '唉',\n",
       " '唯有',\n",
       " '啊',\n",
       " '啊呀',\n",
       " '啊哈',\n",
       " '啊哟',\n",
       " '啐',\n",
       " '啥',\n",
       " '啦',\n",
       " '啪达',\n",
       " '啷当',\n",
       " '喀',\n",
       " '喂',\n",
       " '喏',\n",
       " '喔唷',\n",
       " '喽',\n",
       " '嗡',\n",
       " '嗡嗡',\n",
       " '嗬',\n",
       " '嗯',\n",
       " '嗳',\n",
       " '嘎',\n",
       " '嘎嘎',\n",
       " '嘎登',\n",
       " '嘘',\n",
       " '嘛',\n",
       " '嘻',\n",
       " '嘿',\n",
       " '嘿嘿',\n",
       " '四',\n",
       " '因',\n",
       " '因为',\n",
       " '因了',\n",
       " '因此',\n",
       " '因着',\n",
       " '因而',\n",
       " '固',\n",
       " '固然',\n",
       " '在',\n",
       " '在下',\n",
       " '在于',\n",
       " '地',\n",
       " '均',\n",
       " '坚决',\n",
       " '坚持',\n",
       " '基于',\n",
       " '基本',\n",
       " '基本上',\n",
       " '处在',\n",
       " '处处',\n",
       " '处理',\n",
       " '复杂',\n",
       " '多',\n",
       " '多么',\n",
       " '多亏',\n",
       " '多多',\n",
       " '多多少少',\n",
       " '多多益善',\n",
       " '多少',\n",
       " '多年前',\n",
       " '多年来',\n",
       " '多数',\n",
       " '多次',\n",
       " '够瞧的',\n",
       " '大',\n",
       " '大不了',\n",
       " '大举',\n",
       " '大事',\n",
       " '大体',\n",
       " '大体上',\n",
       " '大凡',\n",
       " '大力',\n",
       " '大多',\n",
       " '大多数',\n",
       " '大大',\n",
       " '大家',\n",
       " '大张旗鼓',\n",
       " '大批',\n",
       " '大抵',\n",
       " '大概',\n",
       " '大略',\n",
       " '大约',\n",
       " '大致',\n",
       " '大都',\n",
       " '大量',\n",
       " '大面儿上',\n",
       " '失去',\n",
       " '奇',\n",
       " '奈',\n",
       " '奋勇',\n",
       " '她',\n",
       " '她们',\n",
       " '她是',\n",
       " '她的',\n",
       " '好',\n",
       " '好在',\n",
       " '好的',\n",
       " '好象',\n",
       " '如',\n",
       " '如上',\n",
       " '如上所述',\n",
       " '如下',\n",
       " '如今',\n",
       " '如何',\n",
       " '如其',\n",
       " '如前所述',\n",
       " '如同',\n",
       " '如常',\n",
       " '如是',\n",
       " '如期',\n",
       " '如果',\n",
       " '如次',\n",
       " '如此',\n",
       " '如此等等',\n",
       " '如若',\n",
       " '始而',\n",
       " '姑且',\n",
       " '存在',\n",
       " '存心',\n",
       " '孰料',\n",
       " '孰知',\n",
       " '宁',\n",
       " '宁可',\n",
       " '宁愿',\n",
       " '宁肯',\n",
       " '它',\n",
       " '它们',\n",
       " '它们的',\n",
       " '它是',\n",
       " '它的',\n",
       " '安全',\n",
       " '完全',\n",
       " '完成',\n",
       " '定',\n",
       " '实现',\n",
       " '实际',\n",
       " '宣布',\n",
       " '容易',\n",
       " '密切',\n",
       " '对',\n",
       " '对于',\n",
       " '对应',\n",
       " '对待',\n",
       " '对方',\n",
       " '对比',\n",
       " '将',\n",
       " '将才',\n",
       " '将要',\n",
       " '将近',\n",
       " '小',\n",
       " '少数',\n",
       " '尔',\n",
       " '尔后',\n",
       " '尔尔',\n",
       " '尔等',\n",
       " '尚且',\n",
       " '尤其',\n",
       " '就',\n",
       " '就地',\n",
       " '就是',\n",
       " '就是了',\n",
       " '就是说',\n",
       " '就此',\n",
       " '就算',\n",
       " '就要',\n",
       " '尽',\n",
       " '尽可能',\n",
       " '尽如人意',\n",
       " '尽心尽力',\n",
       " '尽心竭力',\n",
       " '尽快',\n",
       " '尽早',\n",
       " '尽然',\n",
       " '尽管',\n",
       " '尽管如此',\n",
       " '尽量',\n",
       " '局外',\n",
       " '居然',\n",
       " '届时',\n",
       " '属于',\n",
       " '屡',\n",
       " '屡屡',\n",
       " '屡次',\n",
       " '屡次三番',\n",
       " '岂',\n",
       " '岂但',\n",
       " '岂止',\n",
       " '岂非',\n",
       " '川流不息',\n",
       " '左右',\n",
       " '巨大',\n",
       " '巩固',\n",
       " '差一点',\n",
       " '差不多',\n",
       " '己',\n",
       " '已',\n",
       " '已矣',\n",
       " '已经',\n",
       " '巴',\n",
       " '巴巴',\n",
       " '带',\n",
       " '帮助',\n",
       " '常',\n",
       " '常常',\n",
       " '常言说',\n",
       " '常言说得好',\n",
       " '常言道',\n",
       " '平素',\n",
       " '年复一年',\n",
       " '并',\n",
       " '并不',\n",
       " '并不是',\n",
       " '并且',\n",
       " '并排',\n",
       " '并无',\n",
       " '并没',\n",
       " '并没有',\n",
       " '并肩',\n",
       " '并非',\n",
       " '广大',\n",
       " '广泛',\n",
       " '应当',\n",
       " '应用',\n",
       " '应该',\n",
       " '庶乎',\n",
       " '庶几',\n",
       " '开外',\n",
       " '开始',\n",
       " '开展',\n",
       " '引起',\n",
       " '弗',\n",
       " '弹指之间',\n",
       " '强烈',\n",
       " '强调',\n",
       " '归',\n",
       " '归根到底',\n",
       " '归根结底',\n",
       " '归齐',\n",
       " '当',\n",
       " '当下',\n",
       " '当中',\n",
       " '当儿',\n",
       " '当前',\n",
       " '当即',\n",
       " '当口儿',\n",
       " '当地',\n",
       " '当场',\n",
       " '当头',\n",
       " '当庭',\n",
       " '当时',\n",
       " '当然',\n",
       " '当真',\n",
       " '当着',\n",
       " '形成',\n",
       " '彻夜',\n",
       " '彻底',\n",
       " '彼',\n",
       " '彼时',\n",
       " ...]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stopwords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "###词频特征／ＴＦＩＤＦ"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def seg_text(words):\n",
    "    result = jb.cut(words)\n",
    "    new_words = []\n",
    "    for s in result :\n",
    "        if s not in stopwords:\n",
    "            new_words.append(s)\n",
    "    return ''.join(new_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "train[\"text_segment\"] = train[\"text\"].apply(lambda x :seg_text(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    合晟资产一家专注股票债券二级市场投资合格投资者提供专业资产管理服务企业公司业务范围包括资产管...\n",
       "1                  公司主营业务微企业个体工商户农户客户提供贷款服务设立主营业务未发生变化\n",
       "2    公司立足于商业地产服务致力于商业地产开发销售运营全产业链提供一整套增值服务业务覆盖商业定位策...\n",
       "3    公司工商管理部门核准经营范围投资咨询经济信息咨询企业管理咨询品牌推广策划公共关系策划文化交流...\n",
       "4    公司主营业务中国境内港澳台保险代理销售依托产品研究能力专业化服务能力团体个人保险受众提供投保...\n",
       "Name: text_segment, dtype: object"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train[\"text_segment\"].head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 用TFidf进行特征化处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "#####先后调用Countvectorizer和tfidftransformer两种方法\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer,TfidfTransformer\n",
    "cv = TfidfVectorizer()\n",
    "cv_fit = cv.fit_transform(train['text_segment'])\n",
    "term2id_dict = cv.vocabulary_\n",
    "x = cv_fit.toarray()\n",
    "\n",
    "##重新组成DataFrame,为了可视化|\n",
    "#df_train_tfidf=pd.DataFrame(data = train_tfidf)\n",
    "\n",
    "#df_train_tfidf.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0. 0. 0. ... 0. 0. 0.]\n",
      " [0. 0. 0. ... 0. 0. 0.]\n",
      " [0. 0. 0. ... 0. 0. 0.]\n",
      " ...\n",
      " [0. 0. 0. ... 0. 0. 0.]\n",
      " [0. 0. 0. ... 0. 0. 0.]\n",
      " [0. 0. 0. ... 0. 0. 0.]]\n"
     ]
    }
   ],
   "source": [
    "print(x)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4774, 7127)"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# KMeans聚类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "##一个参数点（聚类数据为K）的模型\n",
    "def K_cluster_analysis(K,X):\n",
    "    print(\"K-means begin with clusters: {}\".format(K))\n",
    "    ##K-means在训练集上训练\n",
    "    mb_kmeans = MiniBatchKMeans(n_clusters = K)\n",
    "    y_pred = mb_kmeans.fit_predict(X)\n",
    "    ####K值的评估标准\n",
    "    ##本案例中训练数据有标签，可采用有参考模型的评价指标\n",
    "    #v_score = metrics. v_measure_score(y_val,y_val_pred)\n",
    "    ##亦可采用无参考默认的评价指标：轮廓铣数 Silhouette Coefficient和Calinski-Harabasz Index\n",
    "    CH_score = metrics.calinski_harabaz_score(X,y_pred)\n",
    "    ##轮廓系数Silhouette Coefficient 在大样本时计算太慢\n",
    "    #si _score = metrics.silhouette_score(X,y_pred)\n",
    "    print(\"CH_score: {}\".format(CH_score))\n",
    "    #print(\"si_score:{}\".format(si_score))\n",
    "    return CH_score\n",
    "    ##return si_score\n",
    "    \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters: 5\n",
      "CH_score: 4.29513045793988\n",
      "K-means begin with clusters: 10\n",
      "CH_score: 2.5521684136418807\n",
      "K-means begin with clusters: 15\n",
      "CH_score: 2.738944777989643\n",
      "K-means begin with clusters: 20\n",
      "CH_score: 2.2742614343222414\n",
      "K-means begin with clusters: 30\n",
      "CH_score: 1.8391725291372751\n",
      "K-means begin with clusters: 40\n",
      "CH_score: 1.5936633428305167\n",
      "K-means begin with clusters: 50\n",
      "CH_score: 1.000109073900396\n"
     ]
    }
   ],
   "source": [
    "###设置超参数（聚类数目K）搜索范围\n",
    "Ks = [5,10,15,20,30,40,50]\n",
    "CH_scores = []\n",
    "#si_score = []\n",
    "for K in Ks:\n",
    "    ch = K_cluster_analysis(K,x)\n",
    "    CH_scores.append(ch)\n",
    "    #si_scores.append(si)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAHiVJREFUeJzt3XmUlNWZx/HvQ9NIiwgKDSKLLQKOO0iJrSQRFSOMCEphXOKSqMEl7lET9+gkmeOSYMzMRHE50cR9Q1CiGAGjRpZmVUANQVQCIw0oixgUeeaPWz3dNA1d3V1db9Vbv885dbqWS9eT94Qf1/vexdwdERGJlxZRFyAiIpmncBcRiSGFu4hIDCncRURiSOEuIhJDCncRkRhSuIuIxJDCXUQkhhTuIiIx1DKqL+7YsaOXlZVF9fUiInlp1qxZq9y9tL52kYV7WVkZFRUVUX29iEheMrOP0mmnYRkRkRhSuIuIxJDCXUQkhhTuIiIxpHAXEYkhhbuISAwp3EVEYij/wn3hQrjqKti0KepKRERyVv6F+4cfwpgx8NprUVciIpKz8i/cBw+GXXeFZ5+NuhIRkZyVf+G+004wbBi88AJs3hx1NSIiOSn/wh0gmYTVq+H116OuREQkJ+VnuB9/PJSUwHPPRV2JiEhOys9wb9MGhg6F55+HLVuirkZEJOfkZ7hDGJpZsQLefjvqSkREck7+hvuwYdCqlWbNiIjUIX/Dfddd4bjjwri7e9TViIjklPwNd4CRI+Gjj2D27KgrERHJKfkd7iNGQFGRhmZERGrJ73Dv0AEGDQrhrqEZEZH/l9/hDmHWzAcfwIIFUVciIpIz8j/cTzoJzDQ0IyJSQ9rhbmZFZjbHzF6s47OdzOxJM1tsZtPNrCyTRe5Qly5w5JFarSoiUkNDeu6XA4u289l5wGfu3gsYA9ze1MIaJJmE+fNh8eKsfq2ISK5KK9zNrBtwAvDAdpqMAB5OPX8GONbMrOnlpWnkyPBTQzMiIkD6Pfe7gWuB7W3k0hX4BMDdNwNrgQ5Nri5de+0FiYTCXUQkpd5wN7NhwEp3n7WjZnW8t83cRDMbbWYVZlZRWVnZgDLTkEzCzJnw8ceZ/b0iInkonZ77QGC4mS0FngCOMbM/1WqzDOgOYGYtgXbAmtq/yN3HunvC3ROlpaVNKnwbVUMzzz+f2d8rIpKH6g13d7/O3bu5exlwGjDZ3c+s1Ww8cE7q+ahUm+yuKurTBw48UEMzIiI0YZ67md1mZsNTLx8EOpjZYuAq4GeZKK7Bkkl480343/+N5OtFRHJFg8Ld3ae6+7DU85vdfXzq+b/c/RR37+XuA9x9SXMUW69kMmxDMG5cJF8vIpIr8n+Fak0HHgi9e2toRkQKXrzC3Sz03qdMgTXb3M8VESkY8Qp3CLNmvvkGxo+PuhIRkcjEL9wTCejRQ0MzIlLQ4hfuZqH3PmkSrFsXdTUiIpGIX7hDGHf/6it46aWoKxERiUQ8w/3II2GPPbQNsIgUrHiGe4sW4RCPiRNh48aoqxERybp4hjuEoZmNG+GVV6KuREQk6+Ib7kcdBbvvrlkzIlKQ4hvuxcUwYgRMmACbNkVdjYhIVsU33CEMzaxbB5MnR12JiEhWxTvcBw+Gtm01NCMiBSfe4b7TTjBsWNglcvPmqKsREcmaeIc7hKGZ1avhr3+NuhIRkayJf7gPGQIlJRqaEZGCEv9wb9MGhg4NZ6tu2RJ1NSIiWRH/cIewkdiKFTBtWtSViIhkRWGE+7BhYd67hmZEpEAURri3awfHHRfC3T3qakREml1hhDuEWTMffQSzZ0ddiYhIs6s33M2stZnNMLN5ZrbAzG6to80PzKzSzOamHuc3T7lNMHw4FBVpG2ARKQjp9Nw3Ace4+yFAX2CImZXX0e5Jd++bejyQ0SozoWPHsJmYhmZEpADUG+4ebEi9LE498jMdk0l4/31YuDDqSkRE
mlVaY+5mVmRmc4GVwKvuPr2OZkkzm29mz5hZ94xWmSknnxzOWNWsGRGJubTC3d2/cfe+QDdggJkdWKvJBKDM3Q8G/gI8XNfvMbPRZlZhZhWVlZVNqbtxunQJR/Ap3EUk5ho0W8bdPwemAkNqvb/a3as2Tb8f6L+dPz/W3RPunigtLW1EuRmQTML8+bB4cTTfLyKSBenMlik1s/ap5yXAYOC9Wm261Hg5HFiUySIz6uSTw0/NmhGRGEun594FmGJm84GZhDH3F83sNjMbnmpzWWqa5DzgMuAHzVNuBpSVQf/+GpoRkVhrWV8Dd58P9Kvj/ZtrPL8OuC6zpTWjZBKuvx4++QS65+a9XxGRpiicFao1JZPhp4ZmRCSmCjPc+/SBAw9UuItIbBVmuEPYBviNN+DTT6OuREQk4wo33JPJsA3BuHFRVyIiknGFG+4HHQS9emnWjIjEUuGGu1novU+ZAmvWRF2NiEhGFW64Qwj3zZthwoSoKxERyajCDvdEIsxz19CMiMRMYYe7WZg1M2kSrF8fdTUiIhlT2OEOYWhm0yZ46aWoKxERyRiF+5FHQufOGpoRkVhRuBcVhZ0iJ06EL7+MuhoRkYxQuEMYd9+4EV55JepKREQyQuEOMGgQ7LabhmZEJDYU7gDFxTBiRJjv/tVXUVcjItJkCvcqySSsXQuvvRZ1JSIiTaZwrzJ4MLRtq22ARSQWFO5VWreGE04Iu0Ru3hx1NSIiTaJwrymZhFWrwj7vIiJ5TOFe09ChUFKiWTMikvcU7jW1aQNDhoRx9y1boq5GRKTR6g13M2ttZjPMbJ6ZLTCzW+tos5OZPWlmi81supmVNUexWZFMwooVMH161JWIiDRaOj33TcAx7n4I0BcYYmbltdqcB3zm7r2AMcDtmS0zi044Icx719CMiOSxesPdgw2pl8Wph9dqNgJ4OPX8GeBYM7OMVZlN7duHaZHPPhvOWBURyUNpjbmbWZGZzQVWAq+6e+0xi67AJwDuvhlYC3TIZKFZlUzC0qUwZ07UlYiINEpa4e7u37h7X6AbMMDMDqzVpK5e+jbdXjMbbWYVZlZRWVnZ8GqzZcSIsFukFjSJSJ5q0GwZd/8cmAoMqfXRMqA7gJm1BNoB25w67e5j3T3h7onS0tJGFZwVHTvCUUdp3F1E8lY6s2VKzax96nkJMBh4r1az8cA5qeejgMnueT5gnUzCe+/BwoVRVyIi0mDp9Ny7AFPMbD4wkzDm/qKZ3WZmw1NtHgQ6mNli4CrgZ81TbhaddFL4qd67iOQhi6qDnUgkvKKiIpLvTtvAgfDFFzB3btSViIgAYGaz3D1RXzutUN2RZBLmzYN//CPqSkREGkThviMjR4afmjUjInlG4b4jZWXQv7/G3UUk7yjc6zNyZNhnZtmyqCsREUmbwr0+yWT4qaEZEckjCvf67LsvHHCAwl1E8orCPR3JZDidaeXKqCsREUmLwj0dyWQ4vGPcuKgrERFJi8I9HQcdBPvso1kzIpI3FO7pMAu998mT4bPPoq5ma19+CV9/HXUVIpJjFO7pSiZh82aYMCHqSmD9enjiiVBThw6w114wY0bUVYlIDlG4p+uww6B79+iGZtavh8ceg5NPhk6d4PTT4W9/gx/8AFq3hu98Bx59NJraRCTntIy6gLxhFhY03XtvCNq2bZv/O9etg/Hj4Zln4OWXYdMm2HNP+NGP4JRT4Mgjw6Eiq1bBqFFw5pnwzjvwy1+G90WkYKnn3hAjR4aAnTix+b5j7Vr44x9h+HAoLYWzzoKKCrjggjAd85NP4J574Nvfrg7wjh1h0qTQ5vbbw3bF69Y1X40ikvPUc2+IgQPDkMizz8Kpp2bu937+ObzwQuihT5oEX30F3brBxReHHnp5ObSo59/hVq3Cf1UcfDBcdlno1Y8fDz17Zq5OEckbCveGKCoKY95/+lOYpVJS0vjf9dlnIdCffhpefTXMeOnRAy65JAyxHH54/YFel4svDqtqTzkl3Cd4
5hk4+ujG1ykieUnDMg2VTIYDPCZNavifXbMGHnoIhg4N/wXwwx/CggWhpz1tGixdCr/+NRxxROOCvcqxx4bZM507w3e/C7//feN/l4jkJfXcG2rQINhttzA0M2JE/e1Xr4bnnw896NdeC9Mpy8rgyitD7zqRCDdrM61XL3j7bTjjjNCbf+cd+O1vobg4898lIjlH4d5QxcXhZue4cWFsvFWrbdusWhUC/emnw8Knb76BvfeGq64Kgd6/f/MEem3t2oVx9+uvhzvuCAd+P/10mBsvIrGmYZnGSCbDrJbJk6vfq6yE++6DwYNhjz1g9GhYsgSuuQZmzQpH9d1+e/P11LenqCh87yOPwFtvwYABYShIRGJNPffGOO442GUX+MMfwjj500/D1Klhc7FeveDaa0MPvW/f7Ab5jpx1FvTuHW4IH3FEWBA1bFjUVYlIM6m3525m3c1sipktMrMFZnZ5HW0GmdlaM5ubetzcPOXmiNatQzA++SRcdFE4pem662DuXPjgA/jVr6Bfv9wJ9irl5TBzZgj54cNDj9496qpEpBmk03PfDPzE3WebWVtglpm96u4La7V7w90Lpyt4662hZz50aNg1MteCfHu6dQuLoc49F372M3j3Xbj//vAPlojERr3h7u4rgBWp5+vNbBHQFagd7oWlTx/46U+jrqJxdt4ZHn8cDjwQbrop/NfGuHHQpUvUlYlIhjTohqqZlQH9gOl1fHyEmc0zsz+b2QEZqE2akxnceGM4PnDBgnCjt6Ii6qpEJEPSDncz2wV4FrjC3WtvXDIb2MvdDwF+B9R5ZJGZjTazCjOrqKysbGzNkkknnxx2lywuDvvVPPFE1BWJSAakFe5mVkwI9kfdfZuTot19nbtvSD2fCBSbWcc62o1194S7J0pLS5tYumTMwQeHFa2HHRa2Er7hhjDzR0TyVjqzZQx4EFjk7r/ZTps9Uu0wswGp37s6k4VKM+vUCf7yFzj//DDbZ+TIsLWxiOSldGbLDATOAt4xs7mp964HegC4+73AKOAiM9sMfAmc5q45dnmnVSsYOzbM/rnyyrAL5gsvhNW1IpJXLKoMTiQSXqEbeLnr1Vfhe9+Dli3DvjhHHRV1RSICmNksd0/U107bD0jdjjsOpk8P+9AMHhx69CKSNxTusn19+oStiAcPDqc8XXpp2NVSRHKewl12rH17ePHFsKPlf/0XDBkS9qUXkZymcJf6FRWFQ0QeeihsXXD44bBoUdRVicgOKNwlfT/8IUyZEg7fLi+HP/856opEZDsU7tIwRx4Zdpbs2TPsjPnrX2tnSZEcpHCXhuvRA958M2xdcPXVoUf/r39FXZWI1KBwl8Zp0waeegp+/nN4+GE4+mj45JOoqxKRFIW7NF6LFnDLLeEkqnnzwkrWE08M2wd//XXU1YkUNIW7NN2oUWHb4KrzYk8+ORwKcs014VBuEck6hbtkxt57w3/+J3z8MUyYEG683n037Ldf2KPmoYdgw4aoqxQpGAp3yayWLcMsmuefD2fL3nEHrF4N550He+wRfv7tb5phI9LMFO7SfDp3DkMzixbBW2/BqaeGQ8UHDoT994e77oJPP426SpFYUrhL8zMLwzQPPggrVsADD8Buu4Xg79YtjNG/+KL2rRHJIIW7ZFfbttVDMwsXwhVXhOcnnhjmz193Hfz971FXKZL3FO4Snf32gzvvDGPzzz0H/fuHMfo+fcL+8Y88Ahs3Rl2lSF5SuEv0iovD0MyECWEh1K9+BcuXwznnQJcucOGF4YxX3YQVSZvCXXLLnnuGoZkPPoCpU+Gkk0IP/vDDw0Hed98Nq1ZFXaVIzlO4S24yC0MzDz8cbsLeey+UlISzXffcMxwB+PLL8M03UVcqkpMU7pL72rULJ0HNmAHz58PFF8PkyTB0aFg8dfPN8OGHUVcpklMU7pJfDjooDM38859h47L994df/CJsQXzssfDYY9qhUoQ0wt3MupvZFDNbZGYLzOzyOtqYmd1jZovN
bL6ZHdo85Yqk7LQTnHJKGJpZuhRuuw2WLIHvfz/chL3kEpgzJ+oqRSKTTs99M/ATd98PKAd+bGb712ozFOideowGfp/RKkV2pEcPuOkm+Mc/4C9/CcM1DzwAhx4K/fqFs18/+yzqKkWyqt5wd/cV7j479Xw9sAjoWqvZCOARD6YB7c2sS8arFdmRFi2qh2aWLw+hbgaXXhp682ecEcJ/y5aoKxVpdg0aczezMqAfML3WR12Bmic1LGPbfwBEsmf33eHHP4bZs8Pj/PPDma/HHQf77BOGcT7+OOoqRZpN2uFuZrsAzwJXuPu62h/X8Ue2WXFiZqPNrMLMKiorKxtWqUhjVQ3NLF8eevX77BMOGSkrgyFDwo3ZTZuirlIko9IKdzMrJgT7o+7+XB1NlgHda7zuBiyv3cjdx7p7wt0TpaWljalXpPFKSuD008PQzJIlcOON4ZCRU0+Frl3DPjfvvBN1lSIZkc5sGQMeBBa5+2+202w8cHZq1kw5sNbdV2SwTpHM2nvvMDSzdGmYcXPMMfA//xNWwQ4YEBZNrV0bdZUijZZOz30gcBZwjJnNTT3+3cwuNLMLU20mAkuAxcD9wMXNU65IhhUVwfHHh6GZ5cthzBj48ku46KJwE/bss+H117WvjeQd84j+T5tIJLyioiKS7xbZIXeoqAj7zz/+OKxbF8bpzz03bGbWVXMFJDpmNsvdE/W10wpVkdrM4LDDwtDMihVhf5uuXeGGG8Kc+qpjBL/6KupKRbZL4S6yIzvvXD0088EH8NOfhqmVI0eGU6SuvjocIyiSYxTuIunq3TvsNf/xx2Hv+W99C37727C/TdUxguvXR12lCKBwF2m4li3D0Mxzz4VTpO68M2xvcP754SbsuefCG2/oTFiJlG6oimSCO0ybFnrvTzwBX3wRhnT694fy8vA4/HDdjJUmS/eGqsJdJNM2bICXXoK33w6BP2dO9c3Xbt2qw768PGxuVlISbb2SVxTuIrli0yaYOzcE/bRpMH169eEiLVvCIYds3bvv1SvM2BGpg8JdJJd9+mkI+aqwnzEj9PgBOnQIIX/44SHwBwyA9u2jrVdyRrrh3jIbxYhILZ07w/Dh4QHhLNiFC6vDftq0sItlVedrv/2qw768HA44IPT6RbZDPXeRXLV2LcycWR3206bBqlXhszZtIJHYejini45QKAQalhGJG/ewm2XNsJ87F77+Onzeo8fWN2v79YPWraOtWTJO4S5SCP71rzAbpyrsp02rPoSkuBj69t26d9+zp27W5jmFu0ihWrFi6979zJmwcWP4rGPHrcP+sMOgXbto65UGUbiLSLB5cziUpOZUzKr9cMzC9glVYV9eHl4XFUVbs2yXwl1Etu/zz8P0y5qzc9asCZ/tskvo0dfs4XfuHG298v8U7iKSPndYvHjrsJ83r3p/nL333noqZt++sNNO0dZcoBTuItI0GzeG7Y1rjt8vWxY+a9UqzMap2bsvK9PN2ixQuItI5v3zn1uHfUVFOJYQoFOnbW/Wtm0bbb0xpHAXkeb39dfw7rtb36x9//3wWYsWYSVteTl897vhgJMW2mW8qRTuIhKNNWu2vVn7+edw0EHwi1/AiSdq+KYJdIaqiERj991hyBD4+c/D/jirV4eDxr/8EkaMgIEDYerUqKuMvXrD3cweMrOVZvbudj4fZGZrzWxu6nFz5ssUkbzVogWcdlrYGG3s2LCC9uij4fjjYdasqKuLrXR67n8AhtTT5g1375t63Nb0skQkdoqL4Uc/gr//He66KwR7IgGjRsF770VdXezUG+7u/ldgTRZqEZFCUFICP/lJ2ATtllvglVfCjddzz4WPPoq6utjI1Jj7EWY2z8z+bGYHZOh3ikic7bprGJdfsgQuvxweewz69AnPV66Murq8l4lwnw3s5e6HAL8Dxm2voZmNNrMKM6uorKzMwFeLSN4rLYXf/CYM15x9Nvz3f4fdK2+6KexpL43S5HB393XuviH1fCJQbGYd
t9N2rLsn3D1RWlra1K8WkTjp3h3uvz9scnbCCWHaZM+ecOed1QulJG1NDncz28MsTFo1swGp37m6qb9XRArUvvvCk0+GG64DBsC114ZDw++9t/pgEqlXOlMhHwfeBvY1s2Vmdp6ZXWhmF6aajALeNbN5wD3AaR7VyigRiY9DDw3z5F9/Pexbc9FF4SzZxx6DLVuiri7naYWqiOQ+d5g4EW64IexWedBB8MtfwrBhBbfaVStURSQ+zMI4/OzZ1atdhw+Hb30r9OxlGwp3EckfNVe73ncfLF0KgwaF7Q602nUrCncRyT/FxTB6dDhg5K67wtbDiQSccopWu6Yo3EUkf9Vc7XrzzfDyy2G163nnhT1sCpjCXUTy3667wq23hpC/7DL405+gd2+44oqCXe2qcBeR+CgthTFjwmrXs86C3/0O9tkn9OoLbLWrwl1E4qdHD3jggXDjdehQ+I//KLjVrgp3EYmvffeFp57adrXrfffFfrWrwl1E4q/2atcLL4T99w9z5mO62lXhLiKF4zvfgTffhAkTYOed4YwzoF8/ePHFsAo2RhTuIlJYzMK2BXPmhH1qvvgiHNods9WuCncRKUwtWsDpp8OiRWHHyZqrXWfPjrq6JlO4i0hhKy6GCy4Iq13vvBNmzoT+/eF734P334+6ukZTuIuIQFjtevXVYSHUTTeFXSgPOADOPz8vV7sq3EVEamrXDm67LYT8pZfCH/8YVrteeSXk0fGgCncRkbp06lS92vXMM+Gee8JCqFtuyYvVrgp3EZEd6dEDHnwwnO06ZEjo1ffsGXajzOHVrgp3EZF0/Nu/wdNPh+2FDzsMrrkmDNeMHZuTq10V7iIiDdG/f9haeOrU0Ku/4IKcXO2qcBcRaYyjjoK33tp6teuhh8JLL+XEaleFu4hIY9Vc7froo7BhQ3j97W/DG29EWlq94W5mD5nZSjN7dzufm5ndY2aLzWy+mR2a+TJFRHJYixah51612vXDD8M+NkOHhuCPoqQ02vwBGLKDz4cCvVOP0cDvm16WiEgeqrna9Y47YMaMMFRz6qnwwQdZLaXecHf3vwJrdtBkBPCIB9OA9mbWJVMFiojknZKSMJtmyRK48cYwDr///mG16yefZKWETIy5dwVqVrss9Z6ISGFr1y6cArVkCVxySfVq1zFjmv2rMxHuVsd7dd4qNrPRZlZhZhWVebSMV0SkSTp1grvvDkMz3/9+ODCkmbXMwO9YBnSv8bobsLyuhu4+FhgLkEgkop8rJCKSTXvtFVa7ZkEmeu7jgbNTs2bKgbXuviIDv1dERBqp3p67mT0ODAI6mtky4BagGMDd7wUmAv8OLAY2Aj9srmJFRCQ99Ya7u59ez+cO/DhjFYmISJNphaqISAwp3EVEYkjhLiISQwp3EZEYUriLiMSQeUT7DptZJfBRJF+eOR2BVVEXkUN0Pbam61FN12JrTbkee7l7aX2NIgv3ODCzCndPRF1HrtD12JquRzVdi61l43poWEZEJIYU7iIiMaRwb5qxUReQY3Q9tqbrUU3XYmvNfj005i4iEkPquYuIxJDCPU11HRRuZrub2atm9vfUz92irDFbzKy7mU0xs0VmtsDMLk+9X6jXo7WZzTCzeanrcWvq/b3NbHrqejxpZq2irjVbzKzIzOaY2Yup14V8LZaa2TtmNtfMKlLvNfvfFYV7+v7AtgeF/wx4zd17A6+lXheCzcBP3H0/oBz4sZntT+Fej03AMe5+CNAXGJI62+B2YEzqenwGnBdhjdl2ObCoxutCvhYAR7t73xrTH5v974rCPU3bOSh8BPBw6vnDwElZLSoi7r7C3Wennq8n/CXuSuFeD3f3DamXxamHA8cAz6TeL5jrYWbdgBOAB1KvjQK9FjvQ7H9XFO5N07nq1KnUz04R15N1ZlYG9AOmU8DXIzUMMRdYCbwK/AP43N03p5oU0sHxdwPXAltSrztQuNcCwj/0k8xslpmNTr3X7H9XMnGGqhQoM9sFeBa4
wt3XhQ5aYXL3b4C+ZtYeeB7Yr65m2a0q+8xsGLDS3WeZ2aCqt+toGvtrUcNAd19uZp2AV83svWx8qXruTfOpmXUBSP1cGXE9WWNmxYRgf9Tdn0u9XbDXo4q7fw5MJdyLaG9mVR2o7R4cHzMDgeFmthR4gjAcczeFeS0AcPflqZ8rCf/wDyALf1cU7k0zHjgn9fwc4IUIa8ma1Bjqg8Aid/9NjY8K9XqUpnrsmFkJMJhwH2IKMCrVrCCuh7tf5+7d3L0MOA2Y7O7fpwCvBYCZtTGztlXPge8C75KFvytaxJSmmgeFA58SDgofBzwF9AA+Bk5x99o3XWPHzL4FvAG8Q/W46vWEcfdCvB4HE26KFRE6TE+5+21m1pPQe90dmAOc6e6boqs0u1LDMle7+7BCvRap/93Pp162BB5z91+aWQea+e+Kwl1EJIY0LCMiEkMKdxGRGFK4i4jEkMJdRCSGFO4iIjGkcBcRiSGFu4hIDCncRURi6P8AltXasnILOeYAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x1169ee80>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "plt.plot(Ks,np.array(CH_scores),'r-',label='CH_scores')\n",
    "\n",
    "index = np.unravel_index(np.argmax(CH_scores,axis = None),len(CH_scores))\n",
    "\n",
    "##最佳超参数\n",
    "Best_K = Ks[index[0]]\n",
    "print (Best_K)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 用最佳的K再次聚类，得到聚类结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "mb_kmeans = MiniBatchKMeans(n_clusters= Best_K)\n",
    "y_pred = mb_kmeans.fit_predict(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([3, 3, 3, ..., 3, 3, 3])"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_pred"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 保存聚类的结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "#保存聚类结果\n",
    "feat_names_Kmeans = 'Kmeans_' + str(Best_K)\n",
    "y = pd.Series(data = train['text'], name = 'target')\n",
    "train_kmeans = pd.concat([pd.Series(name = feat_names_Kmeans,data = y_pred),y],axis = 1)\n",
    "train_kmeans.to_csv('company_classification_train_KMeans.csv',index = False,header = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 保存Kmeans模型，用于后续对测试数据的聚类\n",
    "import _pickle as cPickle\n",
    "cPickle.dump(mb_kmeans,open('mb_kmeans.pkl','wb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
